% Rosenblatt's perceptron paper, Psychological Review 65(6), 1958.
% The title carries no trailing period because bibliography styles supply
% their own punctuation; pages are stored as the full range, not just the
% first page.
@article{rosenblatt1958perceptron,
  author    = {Rosenblatt, Frank},
  title     = {The perceptron: a probabilistic model for information storage and organization in the brain},
  journal   = {Psychological Review},
  volume    = {65},
  number    = {6},
  pages     = {386--408},
  year      = {1958},
  publisher = {American Psychological Association}
}

% LeCun et al., backpropagation for zip code recognition, Neural Computation 1(4), 1989.
% The journal name uses the same capitalisation as the other Neural Computation
% entry in this file so both render identically.
@article{lecun1989backpropagation,
  author    = {LeCun, Yann and Boser, Bernhard and Denker, John S and Henderson, Donnie and Howard, Richard E and Hubbard, Wayne and Jackel, Lawrence D},
  title     = {Backpropagation applied to handwritten zip code recognition},
  journal   = {Neural Computation},
  volume    = {1},
  number    = {4},
  pages     = {541--551},
  year      = {1989},
  publisher = {MIT Press}
}

% Krizhevsky, Sutskever and Hinton, the deep convolutional network paper for
% ImageNet classification, NeurIPS 2012.
% The dataset name is brace-protected so sentence-casing styles keep its
% internal capital N.
@inproceedings{krizhevsky2012imagenet,
  author    = {Krizhevsky, Alex and Sutskever, Ilya and Hinton, Geoffrey E},
  title     = {{ImageNet} classification with deep convolutional neural networks},
  booktitle = {Advances in Neural Information Processing Systems},
  pages     = {1097--1105},
  year      = {2012}
}

% He et al., deep residual learning (ResNet), CVPR 2016.
% The title is entered in Title Case with a single pair of delimiting braces,
% so the bibliography style decides the final casing; the conference acronym
% in booktitle is brace-protected in case a style recases that field.
@inproceedings{he2016deep,
  author    = {He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian},
  title     = {Deep Residual Learning for Image Recognition},
  booktitle = {Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition ({CVPR})},
  pages     = {770--778},
  year      = {2016}
}

% Rumelhart, Hinton and Williams, back-propagation of errors, Nature 323(6088), 1986.
% Pages are stored as the full range rather than only the first page.
@article{rumelhart1986learning,
  author    = {Rumelhart, David E and Hinton, Geoffrey E and Williams, Ronald J},
  title     = {Learning representations by back-propagating errors},
  journal   = {Nature},
  volume    = {323},
  number    = {6088},
  pages     = {533--536},
  year      = {1986},
  publisher = {Nature Publishing Group}
}

% Hochreiter and Schmidhuber, Long Short-Term Memory, Neural Computation 9(8), 1997.
% Each author appears exactly once, with the full given name. The title has a
% single pair of braces and no trailing period so the style controls casing and
% punctuation. The page range is written out in full. The ISSN lives only in
% the issn field. The citation key keeps its original capitalisation so that
% existing citations continue to resolve.
@article{Hochreiter1997lstm,
  author  = {Hochreiter, Sepp and Schmidhuber, J{\"u}rgen},
  title   = {Long Short-Term Memory},
  journal = {Neural Computation},
  volume  = {9},
  number  = {8},
  pages   = {1735--1780},
  year    = {1997},
  issn    = {0899-7667},
  pmid    = {9377276}
}

% Vaswani et al., the Transformer paper, published in the NeurIPS 2017 proceedings.
@inproceedings{vaswani2017attention,
  author    = {Vaswani, Ashish and Shazeer, Noam and Parmar, Niki and Uszkoreit, Jakob and Jones, Llion and Gomez, Aidan N and Kaiser, {\L}ukasz and Polosukhin, Illia},
  title     = {Attention is all you need},
  booktitle = {Advances in Neural Information Processing Systems},
  year      = {2017},
  pages     = {5998--6008}
}

% LeCun, Bengio and Hinton, the Deep learning review article, Nature 521(7553), 2015.
% Pages are stored as the full range rather than only the first page.
@article{lecun2015deep,
  author    = {LeCun, Yann and Bengio, Yoshua and Hinton, Geoffrey},
  title     = {Deep learning},
  journal   = {Nature},
  volume    = {521},
  number    = {7553},
  pages     = {436--444},
  year      = {2015},
  publisher = {Nature Publishing Group}
}

% Kingma and Ba, the Adam optimizer paper, presented at ICLR 2015.
% The year field is the year of the conference named in booktitle. The citation
% key keeps its 2014 suffix so that existing citations continue to resolve.
% The optimizer name and the conference acronym are brace-protected against
% style recasing.
@inproceedings{KingmaAdam2014,
  author    = {Kingma, Diederik P and Ba, Jimmy},
  title     = {{Adam}: A Method for Stochastic Optimization},
  booktitle = {Proceedings of the International Conference on Learning Representations ({ICLR})},
  year      = {2015}
}

% Tieleman and Hinton, the RMSProp lecture (Lecture 6.5) from the Coursera
% course Neural Networks for Machine Learning, 2012.
% This is a course lecture rather than an institutional report, so it is stored
% as a misc entry: the course name goes in howpublished instead of being fused
% into the title, and no institution is given.
@misc{tieleman2012rmsprop,
  author       = {Tieleman, Tijmen and Hinton, Geoffrey},
  title        = {Lecture 6.5---{RMSProp}: Divide the gradient by a running average of its recent magnitude},
  howpublished = {{COURSERA}: Neural Networks for Machine Learning},
  year         = {2012}
}

% Duchi, Hazan and Singer, the AdaGrad paper, JMLR volume 12, 2011.
% The publication month is stored in the month field using the predefined
% jul macro (unquoted), not in the numeric issue field.
@article{duchi2011adagrad,
  author  = {Duchi, John and Hazan, Elad and Singer, Yoram},
  title   = {Adaptive subgradient methods for online learning and stochastic optimization},
  journal = {Journal of Machine Learning Research (JMLR)},
  volume  = {12},
  pages   = {2121--2159},
  month   = jul,
  year    = {2011}
}